This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
library(ggplot2)
library(tidyr)
# Make sure the figure output directory (../../figures) exists.
figures_dir <- file.path("..", "..", "figures")
if (isFALSE(dir.exists(figures_dir))) {
  # recursive = TRUE also creates any missing parent directories.
  dir.create(path = figures_dir, recursive = TRUE)
}
# Load the genomic table (plink2.eigenvec) from ../../data.
# It is joined to the phenotype data on its IID column further down.
file_path <- file.path("..", "..", "data", "plink2.eigenvec")
genomic_data <- read.table(file_path, header = TRUE)
# Report the size of the table that was just loaded. `data` has not been
# assigned at this point, so nrow(data) would print nothing here.
cat("Number of lines in the file: ", nrow(genomic_data), "\n")
Number of lines in the file:
# Load the tabulated dataset (nda3.0.Rds) from ../../data.
# Note: the name `data` shadows the base function utils::data from here on.
file_path <- file.path("..", "..", "data", "nda3.0.Rds")
data <- readRDS(file = file_path)
Warning: input string 'Compañía de Construcion' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8Warning: input string 'Compañia de limpieza' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8Warning: input string 'Compañia de limpieza- PMS' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8Warning: input string 'Demolición, ' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8Warning: input string 'Mom❤️😘' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8Warning: input string 'Grandma 👵 ' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8Warning: input string 'Her own 👟 ' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8Warning: input string 'Her own 👟 ' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
# Keep only the rows whose eventname is the baseline visit.
data <- dplyr::filter(
  data,
  .data$eventname == "baseline_year_1_arm_1"
)
cat("Number of lines after filtering by eventname: ", nrow(data), "\n")
Number of lines after filtering by eventname: 11878
# Drop the `eventname` column
#data <- dplyr::select(data, -eventname)
This takes quite a while — we will work with a subset of the data in the rest of this notebook.
Two variables seem similar: screentime_wkdy_1 and
screentime_wkdy_typical_hr.
# Compare missingness of the two weekday screen-time variables.
# NOTE(review): `data2` is not defined in the visible part of this notebook;
# presumably it is a (subset) copy of `data` created elsewhere - confirm.
data_filtered <- dplyr::select(data2, src_subject_id, screentime_wkdy_1, screentime_wkdy_typical_hr)
# Drop rows where BOTH screen-time columns are missing.
data_filtered <- data_filtered[!(is.na(data_filtered$screentime_wkdy_1) & is.na(data_filtered$screentime_wkdy_typical_hr)), ]
# Print the number of rows remaining after the removal
cat("Number of rows after removal where both columns are NA:", nrow(data_filtered), "\n")
# Check how many rows remain with NA in either column
na_screentime_1 <- is.na(data_filtered$screentime_wkdy_1)
na_screentime_typical <- is.na(data_filtered$screentime_wkdy_typical_hr)
# Rows missing in both columns were removed above, so this count is 0 by
# construction; it is kept as a sanity check.
na_overlap <- sum(na_screentime_1 & na_screentime_typical)
# Rows with a missing value in at least one of the two columns.
na_in_one <- sum(na_screentime_1 | na_screentime_typical)
# Number of NAs in each column individually
na_screentime_1_count <- sum(na_screentime_1)
na_screentime_typical_count <- sum(na_screentime_typical)
cat("Number of NAs in screentime_wkdy_1:", na_screentime_1_count, "\n")
cat("Number of NAs in screentime_wkdy_typical_hr:", na_screentime_typical_count, "\n")
# Print remaining NA analysis
cat("Number of rows where both columns are NA (after cleaning):", na_overlap, "\n")
cat("Number of rows where at least one column is NA (after cleaning):", na_in_one, "\n")
It seems we only need the first one, screentime_wkdy_1.
We will add the equivalent for the weekend screen time,
screentime_wknd_7. For reading, we use
sports_activity_ss_read_hours_p.
# All column names of the loaded dataset, used for the regex searches below.
column_names <- names(data)

# Return the elements of `column_names` that match the regular expression
# `search_string` (an empty character vector when nothing matches).
search_columns <- function(search_string, column_names) {
  grep(pattern = search_string, x = column_names, value = TRUE)
}
# Demographic columns; rows with a missing value in any of them are removed
# further down.
demographic_variables <- c(
  "interview_age",
  "sex",
  "abcd_site",
  "mri_info_device.serial.number",
  "married.bl",
  "household.income.bl",
  "high.educ.bl",
  "hisp",
  "rel_family_id"
)

# Behavioural / phenotype columns used in this notebook.
phenotype_variables <- c(
  "sports_activity_ss_read_hours_p",
  "cbcl_scr_dsm5_adhd_t",
  "screentime_wkdy_1",
  "screentime_wknd_7"
)

# Columns selected by name pattern from the dataset's column names.
nih_scores <- search_columns("nihtbx.*uncorrected", column_names)
imaging_tabulated_variables <- search_columns("smri_(thick|area).*desikan", column_names)

# Columns used for the imaging quality-control filters.
quality_control_variables <- c("mrif_score", "fsqc_qc")

cat("Number of imaging variables found: ", length(imaging_tabulated_variables), "\n")
Number of imaging variables found: 142
# Restrict the dataset to the identifier, the event name and every column
# named in the variable lists defined above, in that order.
selected_columns <- c(
  "src_subject_id",
  "eventname",
  demographic_variables,
  phenotype_variables,
  nih_scores,
  quality_control_variables,
  imaging_tabulated_variables
)
data_subset <- dplyr::select(data, all_of(selected_columns))
cat("\nMatrix size after column selection: ", dim(data_subset), "\n")
Matrix size after column selection: 11878 169
# Attach the genomic columns to each subject (src_subject_id matched against
# IID). A left join keeps every row of data_subset; unmatched subjects get NA
# in the genomic columns. The FID column is dropped afterwards.
data_subset <- dplyr::select(
  dplyr::left_join(
    data_subset,
    genomic_data,
    by = c(src_subject_id = "IID")
  ),
  -FID
)
cat("Matrix size after merging with genomic data: ", dim(data_subset), "\n")
Matrix size after merging with genomic data: 11878 189
# Keep only rows whose fsqc_qc value is "accept".
data_subset <- dplyr::filter(
  data_subset,
  .data$fsqc_qc == "accept"
)
cat("Number of lines after filtering by fsqc_qc: ", nrow(data_subset), "\n")
Number of lines after filtering by fsqc_qc: 11265
# Keep only rows whose mrif_score is one of the two accepted values.
# Rows with a missing mrif_score are dropped as well.
accepted_mrif_scores <- c(
  "No abnormal findings",
  "Normal anatomical variant of no clinical significance"
)
data_subset <- dplyr::filter(data_subset, mrif_score %in% accepted_mrif_scores)
cat("Number of lines after filtering by mrif_score: ", nrow(data_subset), "\n")
Number of lines after filtering by mrif_score: 10783
# Remove rows with a missing value in each phenotype variable, one variable
# at a time, printing the remaining row count after every step.
# Variables whose name contains "cbcl" are exempt from this filter.
required_phenotypes <- phenotype_variables[!grepl("cbcl", phenotype_variables)]
for (variable in required_phenotypes) {
  data_subset <- dplyr::filter(data_subset, !is.na(.data[[variable]]))
  cat("Number of lines after filtering NAs in", variable, ":", nrow(data_subset), "\n")
}
Number of lines after filtering NAs in sports_activity_ss_read_hours_p : 10038
Number of lines after filtering NAs in screentime_wkdy_1 : 10022
Number of lines after filtering NAs in screentime_wknd_7 : 10017
# Drop rows with no reading-hours value. The phenotype loop above already
# filters this column, so the row count is expected to stay the same.
data_subset <- tidyr::drop_na(data_subset, sports_activity_ss_read_hours_p)
cat("Number of lines after filtering missing reading data: ", nrow(data_subset), "\n")
Number of lines after filtering missing reading data: 10017
# Outlier filter: keep rows reporting at most 56 reading hours.
data_subset <- dplyr::filter(
  data_subset,
  .data$sports_activity_ss_read_hours_p <= 56
)
cat("Number of lines after filtering by reading values above 56: ", nrow(data_subset), "\n")
Number of lines after filtering by reading values above 56: 9968
# Drop every row that has a missing value in any tabulated imaging column.
data_subset <- tidyr::drop_na(data_subset, all_of(imaging_tabulated_variables))
cat("Number of lines after filtering NAs in tabulated imaging data:", nrow(data_subset), "\n")
Number of lines after filtering NAs in tabulated imaging data: 9965
# Remove rows with a missing value in each demographic variable in turn,
# printing the remaining row count after every variable so the attrition
# caused by each one is visible.
for (variable in demographic_variables) {
  has_value <- !is.na(data_subset[[variable]])
  data_subset <- dplyr::filter(data_subset, has_value)
  cat("Number of lines after filtering NAs in", variable, ":", nrow(data_subset), "\n")
}
Number of lines after filtering NAs in interview_age : 9965
Number of lines after filtering NAs in sex : 9965
Number of lines after filtering NAs in abcd_site : 9965
Number of lines after filtering NAs in mri_info_device.serial.number : 9954
Number of lines after filtering NAs in married.bl : 9881
Number of lines after filtering NAs in household.income.bl : 9118
Number of lines after filtering NAs in high.educ.bl : 9114
Number of lines after filtering NAs in hisp : 9013
Number of lines after filtering NAs in rel_family_id : 9013
# Drop subjects without genomic data: after the left join, such rows have NA
# in PC1.
data_subset <- dplyr::filter(
  data_subset,
  !is.na(.data$PC1)
)
cat("Number of lines after filtering NAs in genomic data: ", nrow(data_subset), "\n")
Number of lines after filtering NAs in genomic data: 8127
# # Filter missing NIH scores (only done for Figure 1)
# for (variable in c("nihtbx_fluidcomp_uncorrected", "nihtbx_cryst_uncorrected", "nihtbx_totalcomp_uncorrected")) {
# data_subset <- dplyr::filter(data_subset, !is.na(data_subset[[variable]]))
# cat("Number of lines after filtering NAs in", variable, ":", nrow(data_subset), "\n")
# }
# “So it's 11,875 total > 11,810 (missing imaging data) > 10738 (imaging QC) > 10017 (missing behavioral data) > 9,968 (outlier filtering)”
#“I can confirm that imaging QC, outlier filtering, and missing behavioral data yields 9,968 subjects but after running DEAPext the final analysis consists of 8,125”
#“Thanks to Pierre's efforts we figured out that the drop from the 9000s to 8000s post-analysis is surprisingly from missing demographic data (most prominently household income and hispanic ethnicity but others as well). My pre-filtering steps only filtered for missing behavioral data which is why there was a discrepancy that only was revealed post-analysis.”
# Create the screen-time variable (pseudo-continuous, hours per day).
# screentime_wkdy_1 / screentime_wknd_7 are ordered categories running from
# "None" to "4+ hours". The summary output in this notebook shows that their
# second level is labelled "0.25" and "< 30 minutes" respectively, not
# "15 minutes", so the conversion cannot rely on matching labels.

# Hours assigned to each category, in level order. The names document the
# intended categories.
screentime_levels <- c("None" = 0, "15 minutes" = 0.25, "30 minutes" = 0.5, "1 hour" = 1, "2 hours" = 2, "3 hours" = 3, "4+ hours" = 4)

# Convert a screen-time column to hours.
# Indexing a vector with a factor uses the factor's integer codes, not its
# labels, so factors are mapped by level position. That is made explicit
# here, and the number of levels is checked so a differently coded column
# fails loudly instead of silently producing wrong hours.
# Non-factor input is looked up in `screentime_levels` directly, as before.
screentime_to_hours <- function(x) {
  if (is.factor(x)) {
    if (nlevels(x) != length(screentime_levels)) {
      stop(
        "Expected ", length(screentime_levels), " screen-time levels, found ",
        nlevels(x), ".",
        call. = FALSE
      )
    }
    return(unname(screentime_levels)[as.integer(x)])
  }
  as.numeric(screentime_levels[x])
}

data_subset$screentime_wkdy_1_num <- screentime_to_hours(data_subset$screentime_wkdy_1)
data_subset$screentime_wknd_7_num <- screentime_to_hours(data_subset$screentime_wknd_7)
# Weighted daily average: 5 weekdays and 2 weekend days per week.
data_subset$screentime <- (data_subset$screentime_wkdy_1_num * 5 + data_subset$screentime_wknd_7_num * 2) / 7
# Create the daily reading time variable (reading hours divided by 7).
data_subset$readtime <- data_subset$sports_activity_ss_read_hours_p / 7
# Print a summary() of every column except the subject identifier.
summary_columns <- colnames(data_subset)
summary_columns <- summary_columns[summary_columns != "src_subject_id"]
for (variable in summary_columns) {
  cat("\nSummary for variable: ", variable, "\n")
  print(summary(data_subset[[variable]]))
}
Summary for variable: eventname
1_year_follow_up_y_arm_1 2_year_follow_up_y_arm_1 baseline_year_1_arm_1 18_month_follow_up_arm_1 30_month_follow_up_arm_1 6_month_follow_up_arm_1
0 0 8127 0 0 0
Summary for variable: interview_age
Min. 1st Qu. Median Mean 3rd Qu. Max.
107.0 112.0 119.0 119.1 126.0 132.0
Summary for variable: sex
F M
3855 4272
Summary for variable: abcd_site
site01 site02 site03 site04 site05 site06 site07 site08 site09 site10 site11 site12 site13 site14 site15 site16 site17 site18 site19 site20 site21 site22
0 233 428 418 458 261 450 230 244 296 471 313 357 438 476 268 820 416 281 367 520 361 21
Summary for variable: mri_info_device.serial.number
HASH03db707f HASH11ad4ed5 HASH1314a204 HASH311170b9 HASH31ce566d HASH3935c89e HASH4036a433 HASH48f7cbc3 HASH4b0b8b05 HASH4d1ed7b1 HASH5ac2b20b HASH5b0cf1bb HASH5b2fcf80
156 304 361 400 260 35 805 0 22 291 294 366 413 241
HASH65b39280 HASH69f406fa HASH6b4422a7 HASH7911780b HASH7f91147d HASH96a0c182 HASHa3e45734 HASHb640a1b8 HASHc3bf3d9c HASHc9398971 HASHd422be27 HASHd7cb4c6d HASHdb2589d4 HASHe3ce02d3
222 98 233 262 68 440 281 337 343 186 326 385 417 86
HASHe4f6957a HASHe76e6d72 HASHfeb7e81a
313 16 166
Summary for variable: married.bl
no yes
2318 5809
Summary for variable: household.income.bl
[<50K] [>=50K & <100K] [>=100K]
2157 2365 3605
Summary for variable: high.educ.bl
< HS Diploma HS Diploma/GED Some College Bachelor Post Graduate Degree
279 594 1996 2224 3034
Summary for variable: hisp
No Yes
6563 1564
Summary for variable: rel_family_id
Min. 1st Qu. Median Mean 3rd Qu. Max.
3 3004 5926 5948 8928 11881
Summary for variable: sports_activity_ss_read_hours_p
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 3.000 4.227 6.000 50.000
Summary for variable: cbcl_scr_dsm5_adhd_t
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
50.00 50.00 50.00 53.11 55.00 80.00 1
Summary for variable: screentime_wkdy_1
None 0.25 30 minutes 1 hour 2 hours 3 hours 4+ hours
1050 1029 1903 2092 1104 463 486
Summary for variable: screentime_wknd_7
None < 30 minutes 30 minutes 1 hour 2 hours 3 hours 4+ hours
529 637 1169 2231 1690 818 1053
Summary for variable: nihtbx_picvocab_uncorrected
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
36.00 80.00 85.00 85.21 90.00 119.00 95
Summary for variable: nihtbx_flanker_uncorrected
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
54.00 90.00 96.00 94.47 101.00 116.00 99
Summary for variable: nihtbx_list_uncorrected
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
36.00 90.00 97.00 97.63 105.00 136.00 123
Summary for variable: nihtbx_cardsort_uncorrected
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
50.00 88.00 94.00 93.08 99.00 120.00 97
Summary for variable: nihtbx_pattern_uncorrected
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
36.00 80.00 88.00 88.37 99.00 140.00 111
Summary for variable: nihtbx_picture_uncorrected
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
76.0 95.0 103.0 103.4 112.0 136.0 101
Summary for variable: nihtbx_reading_uncorrected
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
63.00 88.00 91.00 91.28 95.00 119.00 103
Summary for variable: nihtbx_fluidcomp_uncorrected
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
44.00 86.00 93.00 92.35 99.00 131.00 150
Summary for variable: nihtbx_cryst_uncorrected
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
59.00 83.00 87.00 86.99 91.00 115.00 115
Summary for variable: nihtbx_totalcomp_uncorrected
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
46.00 82.00 88.00 87.06 93.00 117.00 154
Summary for variable: mrif_score
Image artifacts prevent radiology read No abnormal findings Normal anatomical variant of no clinical significance
0 6723 1404
Consider clinical referral Consider immediate clinical referral
0 0
Summary for variable: fsqc_qc
reject accept
0 8127
Summary for variable: smri_area_cort.desikan_bankssts.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
616 1063 1189 1200 1323 2029
Summary for variable: smri_area_cort.desikan_bankssts.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
596 993 1099 1110 1214 1843
Summary for variable: smri_area_cort.desikan_caudalanteriorcingulate.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
357.0 630.0 718.0 743.6 833.0 1598.0
Summary for variable: smri_area_cort.desikan_caudalanteriorcingulate.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
377.0 739.0 847.0 863.4 972.0 1722.0
Summary for variable: smri_area_cort.desikan_caudalmiddlefrontal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1146 2336 2640 2660 2957 4398
Summary for variable: smri_area_cort.desikan_caudalmiddlefrontal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
985 2122 2423 2451 2754 4856
Summary for variable: smri_area_cort.desikan_cuneus.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
590 1430 1563 1565 1703 2670
Summary for variable: smri_area_cort.desikan_cuneus.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
540 1473 1612 1606 1753 2581
Summary for variable: smri_area_cort.desikan_entorhinal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
175.0 359.0 412.0 419.5 470.0 992.0
Summary for variable: smri_area_cort.desikan_entorhinal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
144.0 301.0 356.0 365.3 419.0 1029.0
Summary for variable: smri_area_cort.desikan_frontalpole.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
68.0 180.0 204.0 205.3 229.0 400.0
Summary for variable: smri_area_cort.desikan_frontalpole.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
102.0 257.0 290.0 292.2 324.0 555.0
Summary for variable: smri_area_cort.desikan_fusiform.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2091 3270 3558 3579 3870 5822
Summary for variable: smri_area_cort.desikan_fusiform.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2000 3192 3477 3498 3793 5230
Summary for variable: smri_area_cort.desikan_inferiorparietal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2881 4732 5200 5251 5722 8900
Summary for variable: smri_area_cort.desikan_inferiorparietal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
3668 5696 6247 6288 6833 9975
Summary for variable: smri_area_cort.desikan_inferiortemporal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1962 3285 3635 3657 4004 5968
Summary for variable: smri_area_cort.desikan_inferiortemporal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1772 3129 3451 3458 3780 5395
Summary for variable: smri_area_cort.desikan_insula.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1532 2102 2255 2275 2427 4358
Summary for variable: smri_area_cort.desikan_insula.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1507 2091 2258 2280 2450 3804
Summary for variable: smri_area_cort.desikan_isthmuscingulate.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
545 970 1093 1116 1236 2911
Summary for variable: smri_area_cort.desikan_isthmuscingulate.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
560 916 1021 1039 1142 4212
Summary for variable: smri_area_cort.desikan_lateraloccipital.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
3053 4687 5122 5151 5575 8764
Summary for variable: smri_area_cort.desikan_lateraloccipital.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2733 4553 4983 5009 5429 7879
Summary for variable: smri_area_cort.desikan_lateralorbitofrontal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1817 2617 2821 2824 3028 4227
Summary for variable: smri_area_cort.desikan_lateralorbitofrontal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1703 2537 2747 2762 2973 4357
Summary for variable: smri_area_cort.desikan_lingual.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2099 3072 3344 3358 3628 5254
Summary for variable: smri_area_cort.desikan_lingual.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1777 3069 3328 3350 3615 5346
Summary for variable: smri_area_cort.desikan_medialorbitofrontal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1059 1661 1823 1839 2002 3016
Summary for variable: smri_area_cort.desikan_medialorbitofrontal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1149 1730 1873 1880 2023 2669
Summary for variable: smri_area_cort.desikan_middletemporal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1947 3111 3394 3422 3712 5668
Summary for variable: smri_area_cort.desikan_middletemporal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2238 3457 3763 3791 4106 6275
Summary for variable: smri_area_cort.desikan_paracentral.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
885 1274 1398 1422 1548 2646
Summary for variable: smri_area_cort.desikan_paracentral.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1031 1446 1603 1631 1781 3632
Summary for variable: smri_area_cort.desikan_parahippocampal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
414.0 680.0 752.0 767.4 834.0 2615.0
Summary for variable: smri_area_cort.desikan_parahippocampal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
422.0 661.0 731.0 746.0 813.5 2703.0
Summary for variable: smri_area_cort.desikan_parsopercularis.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1079 1626 1819 1847 2038 3461
Summary for variable: smri_area_cort.desikan_parsopercularis.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
843 1354 1525 1553 1723 2806
Summary for variable: smri_area_cort.desikan_parsorbitalis.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
306.0 612.0 671.0 673.2 732.0 1044.0
Summary for variable: smri_area_cort.desikan_parsorbitalis.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
471 756 832 834 906 1268
Summary for variable: smri_area_cort.desikan_parstriangularis.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
806 1287 1428 1441 1584 2550
Summary for variable: smri_area_cort.desikan_parstriangularis.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
856 1496 1676 1691 1869 2784
Summary for variable: smri_area_cort.desikan_pericalcarine.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
734 1342 1496 1504 1662 2598
Summary for variable: smri_area_cort.desikan_pericalcarine.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
652 1482 1644 1643 1809 2525
Summary for variable: smri_area_cort.desikan_postcentral.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2209 4163 4515 4572 4930 7847
Summary for variable: smri_area_cort.desikan_postcentral.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2056 3954 4300 4344 4692 7361
Summary for variable: smri_area_cort.desikan_posteriorcingulate.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
788 1180 1308 1330 1456 2575
Summary for variable: smri_area_cort.desikan_posteriorcingulate.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
752 1211 1351 1374 1506 2894
Summary for variable: smri_area_cort.desikan_precentral.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
3293 4705 5080 5137 5510 9186
Summary for variable: smri_area_cort.desikan_precentral.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
3288 4745 5136 5194 5569 9248
Summary for variable: smri_area_cort.desikan_precuneus.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2455 3840 4195 4210 4557 6470
Summary for variable: smri_area_cort.desikan_precuneus.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2356 4020 4405 4426 4812 6759
Summary for variable: smri_area_cort.desikan_rostralanteriorcingulate.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
346.0 784.0 892.0 897.7 1003.0 1604.0
Summary for variable: smri_area_cort.desikan_rostralanteriorcingulate.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
334.0 645.0 737.0 743.8 835.0 1337.0
Summary for variable: smri_area_cort.desikan_rostralmiddlefrontal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
3730 5944 6507 6547 7087 10588
Summary for variable: smri_area_cort.desikan_rostralmiddlefrontal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
4003 6140 6714 6777 7348 11975
Summary for variable: smri_area_cort.desikan_superiorfrontal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
4924 7322 7944 7995 8595 12349
Summary for variable: smri_area_cort.desikan_superiorfrontal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
4852 7087 7710 7766 8379 12218
Summary for variable: smri_area_cort.desikan_superiorparietal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
3428 5480 5956 6003 6489 9794
Summary for variable: smri_area_cort.desikan_superiorparietal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
3443 5523 5993 6033 6504 9138
Summary for variable: smri_area_cort.desikan_superiortemporal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2601 3784 4112 4140 4469 6635
Summary for variable: smri_area_cort.desikan_superiortemporal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2601 3616 3896 3928 4212 6193
Summary for variable: smri_area_cort.desikan_supramarginal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2112 3948 4371 4428 4850 8018
Summary for variable: smri_area_cort.desikan_supramarginal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2177 3702 4099 4153 4552 7199
Summary for variable: smri_area_cort.desikan_temporalpole.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
259 427 467 470 511 767
Summary for variable: smri_area_cort.desikan_temporalpole.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
177 369 406 411 449 706
Summary for variable: smri_area_cort.desikan_total
Min. 1st Qu. Median Mean 3rd Qu. Max.
129626 174368 186157 186792 198481 259003
Summary for variable: smri_area_cort.desikan_total.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
64276 86920 92840 93140 99020 131154
Summary for variable: smri_area_cort.desikan_total.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
65323 87379 93319 93652 99528 130937
Summary for variable: smri_area_cort.desikan_transversetemporal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
238.0 432.0 481.0 488.4 537.0 848.0
Summary for variable: smri_area_cort.desikan_transversetemporal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
182 320 356 361 397 667
Summary for variable: smri_thick_cort.desikan_bankssts.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.997 2.698 2.808 2.807 2.917 3.509
Summary for variable: smri_thick_cort.desikan_bankssts.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.199 2.783 2.899 2.899 3.016 3.604
Summary for variable: smri_thick_cort.desikan_caudalanteriorcingulate.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.997 2.723 2.867 2.880 3.030 4.023
Summary for variable: smri_thick_cort.desikan_caudalanteriorcingulate.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.000 2.606 2.731 2.744 2.865 3.744
Summary for variable: smri_thick_cort.desikan_caudalmiddlefrontal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.055 2.789 2.882 2.876 2.972 3.484
Summary for variable: smri_thick_cort.desikan_caudalmiddlefrontal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.108 2.759 2.854 2.846 2.944 3.349
Summary for variable: smri_thick_cort.desikan_cuneus.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.423 1.957 2.057 2.061 2.161 2.753
Summary for variable: smri_thick_cort.desikan_cuneus.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.447 1.977 2.082 2.082 2.191 2.765
Summary for variable: smri_thick_cort.desikan_entorhinal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.895 3.245 3.454 3.451 3.661 4.606
Summary for variable: smri_thick_cort.desikan_entorhinal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.621 3.347 3.586 3.573 3.825 4.648
Summary for variable: smri_thick_cort.desikan_frontalpole.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.443 3.008 3.212 3.211 3.419 4.735
Summary for variable: smri_thick_cort.desikan_frontalpole.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.433 2.983 3.168 3.173 3.369 4.415
Summary for variable: smri_thick_cort.desikan_fusiform.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.435 2.883 2.965 2.963 3.045 3.417
Summary for variable: smri_thick_cort.desikan_fusiform.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.269 2.885 2.968 2.966 3.051 3.409
Summary for variable: smri_thick_cort.desikan_inferiorparietal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.066 2.693 2.797 2.781 2.883 3.266
Summary for variable: smri_thick_cort.desikan_inferiorparietal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.140 2.727 2.825 2.809 2.908 3.268
Summary for variable: smri_thick_cort.desikan_inferiortemporal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.264 2.984 3.088 3.082 3.187 3.634
Summary for variable: smri_thick_cort.desikan_inferiortemporal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.249 3.012 3.115 3.106 3.212 3.666
Summary for variable: smri_thick_cort.desikan_insula.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.699 3.215 3.308 3.305 3.397 3.740
Summary for variable: smri_thick_cort.desikan_insula.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.646 3.197 3.298 3.296 3.399 3.835
Summary for variable: smri_thick_cort.desikan_isthmuscingulate.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.081 2.571 2.690 2.699 2.813 3.532
Summary for variable: smri_thick_cort.desikan_isthmuscingulate.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.106 2.526 2.640 2.651 2.762 3.513
Summary for variable: smri_thick_cort.desikan_lateraloccipital.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.811 2.233 2.342 2.334 2.439 2.910
Summary for variable: smri_thick_cort.desikan_lateraloccipital.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.807 2.294 2.405 2.395 2.505 2.936
Summary for variable: smri_thick_cort.desikan_lateralorbitofrontal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.099 2.892 2.992 2.990 3.091 3.566
Summary for variable: smri_thick_cort.desikan_lateralorbitofrontal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.907 2.863 2.970 2.965 3.069 3.510
Summary for variable: smri_thick_cort.desikan_lingual.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.663 2.107 2.197 2.198 2.287 2.720
Summary for variable: smri_thick_cort.desikan_lingual.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.724 2.144 2.232 2.235 2.326 2.754
Summary for variable: smri_thick_cort.desikan_mean
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.367 2.713 2.786 2.780 2.850 3.140
Summary for variable: smri_thick_cort.desikan_mean.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.337 2.717 2.789 2.783 2.855 3.149
Summary for variable: smri_thick_cort.desikan_mean.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.351 2.710 2.782 2.777 2.847 3.130
Summary for variable: smri_thick_cort.desikan_medialorbitofrontal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.823 2.615 2.728 2.729 2.840 3.366
Summary for variable: smri_thick_cort.desikan_medialorbitofrontal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.019 2.637 2.754 2.752 2.868 3.440
Summary for variable: smri_thick_cort.desikan_middletemporal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.211 3.098 3.213 3.193 3.314 3.798
Summary for variable: smri_thick_cort.desikan_middletemporal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.209 3.117 3.236 3.213 3.336 3.854
Summary for variable: smri_thick_cort.desikan_paracentral.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.084 2.649 2.753 2.750 2.852 3.386
Summary for variable: smri_thick_cort.desikan_paracentral.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.156 2.657 2.753 2.751 2.845 3.282
Summary for variable: smri_thick_cort.desikan_parahippocampal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.108 2.803 3.002 2.999 3.193 4.325
Summary for variable: smri_thick_cort.desikan_parahippocampal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.026 2.791 2.954 2.954 3.116 3.941
Summary for variable: smri_thick_cort.desikan_parsopercularis.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.216 2.830 2.920 2.916 3.006 3.475
Summary for variable: smri_thick_cort.desikan_parsopercularis.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.241 2.810 2.908 2.907 3.002 3.441
Summary for variable: smri_thick_cort.desikan_parsorbitalis.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.205 2.996 3.130 3.130 3.265 3.971
Summary for variable: smri_thick_cort.desikan_parsorbitalis.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.225 2.967 3.101 3.102 3.237 4.117
Summary for variable: smri_thick_cort.desikan_parstriangularis.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.161 2.725 2.825 2.823 2.925 3.424
Summary for variable: smri_thick_cort.desikan_parstriangularis.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.740 2.707 2.809 2.805 2.911 3.506
Summary for variable: smri_thick_cort.desikan_pericalcarine.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.287 1.658 1.758 1.762 1.857 2.491
Summary for variable: smri_thick_cort.desikan_pericalcarine.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.288 1.651 1.752 1.755 1.852 2.366
Summary for variable: smri_thick_cort.desikan_postcentral.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.778 2.229 2.330 2.331 2.434 3.055
Summary for variable: smri_thick_cort.desikan_postcentral.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.746 2.195 2.298 2.301 2.404 2.976
Summary for variable: smri_thick_cort.desikan_posteriorcingulate.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.228 2.671 2.765 2.773 2.868 3.549
Summary for variable: smri_thick_cort.desikan_posteriorcingulate.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.038 2.638 2.721 2.729 2.810 3.388
Summary for variable: smri_thick_cort.desikan_precentral.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.812 2.716 2.811 2.798 2.895 3.241
Summary for variable: smri_thick_cort.desikan_precentral.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.918 2.683 2.775 2.762 2.855 3.257
Summary for variable: smri_thick_cort.desikan_precuneus.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.193 2.631 2.715 2.713 2.797 3.182
Summary for variable: smri_thick_cort.desikan_precuneus.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.008 2.642 2.726 2.721 2.806 3.179
Summary for variable: smri_thick_cort.desikan_rostralanteriorcingulate.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.229 3.035 3.169 3.168 3.306 4.135
Summary for variable: smri_thick_cort.desikan_rostralanteriorcingulate.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.092 2.920 3.062 3.062 3.200 3.899
Summary for variable: smri_thick_cort.desikan_rostralmiddlefrontal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.919 2.647 2.745 2.733 2.833 3.208
Summary for variable: smri_thick_cort.desikan_rostralmiddlefrontal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.962 2.600 2.695 2.688 2.789 3.200
Summary for variable: smri_thick_cort.desikan_superiorfrontal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.483 3.047 3.147 3.142 3.242 3.767
Summary for variable: smri_thick_cort.desikan_superiorfrontal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.509 3.012 3.104 3.103 3.198 3.766
Summary for variable: smri_thick_cort.desikan_superiorparietal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.847 2.410 2.503 2.494 2.588 2.967
Summary for variable: smri_thick_cort.desikan_superiorparietal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.898 2.411 2.504 2.495 2.588 3.073
Summary for variable: smri_thick_cort.desikan_superiortemporal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.344 3.004 3.121 3.111 3.228 3.680
Summary for variable: smri_thick_cort.desikan_superiortemporal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.339 3.027 3.132 3.129 3.236 3.674
Summary for variable: smri_thick_cort.desikan_supramarginal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.114 2.779 2.884 2.868 2.979 3.350
Summary for variable: smri_thick_cort.desikan_supramarginal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.107 2.769 2.890 2.863 2.982 3.337
Summary for variable: smri_thick_cort.desikan_temporalpole.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.831 3.640 3.822 3.811 4.002 4.734
Summary for variable: smri_thick_cort.desikan_temporalpole.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.358 3.760 3.963 3.942 4.151 4.747
Summary for variable: smri_thick_cort.desikan_transversetemporal.lh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.905 2.616 2.753 2.754 2.889 3.731
Summary for variable: smri_thick_cort.desikan_transversetemporal.rh
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.987 2.643 2.777 2.776 2.910 3.602
Summary for variable: PC1
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.0064216 -0.0057921 -0.0053010 -0.0006869 -0.0017259 0.0296915
Summary for variable: PC2
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.0438964 0.0015891 0.0047200 0.0004142 0.0053848 0.0071839
Summary for variable: PC3
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.0327854 -0.0006484 0.0001592 0.0001899 0.0008417 0.0539392
Summary for variable: PC4
Min. 1st Qu. Median Mean 3rd Qu. Max.
-5.063e-03 -1.321e-03 -8.795e-04 4.674e-05 -3.946e-04 2.706e-01
Summary for variable: PC5
Min. 1st Qu. Median Mean 3rd Qu. Max.
-6.119e-02 -1.618e-03 2.765e-03 1.377e-05 6.196e-03 1.614e-02
Summary for variable: PC6
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.0554863 -0.0037522 0.0011844 0.0000564 0.0054741 0.0288450
Summary for variable: PC7
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.0922031 -0.0067865 -0.0017109 -0.0001939 0.0046775 0.0427558
Summary for variable: PC8
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.5157770 -0.0017237 -0.0000645 0.0000139 0.0014557 0.3234070
Summary for variable: PC9
Min. 1st Qu. Median Mean 3rd Qu. Max.
-7.875e-02 -6.520e-03 -4.363e-04 6.124e-05 5.852e-03 1.354e-01
Summary for variable: PC10
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.1632660 -0.0021802 0.0005472 -0.0000082 0.0032797 0.4396320
Summary for variable: PC11
Min. 1st Qu. Median Mean 3rd Qu. Max.
-2.753e-01 -5.834e-03 1.820e-04 9.679e-05 6.103e-03 8.028e-02
Summary for variable: PC12
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.4443500 -0.0013099 -0.0000604 0.0000175 0.0012488 0.5326450
Summary for variable: PC13
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.3194340 -0.0026527 0.0000338 -0.0000508 0.0027588 0.2139670
Summary for variable: PC14
Min. 1st Qu. Median Mean 3rd Qu. Max.
-2.183e-01 -5.278e-03 -1.686e-04 -1.528e-05 4.622e-03 1.429e-01
Summary for variable: PC15
Min. 1st Qu. Median Mean 3rd Qu. Max.
-7.143e-02 -5.418e-03 -1.091e-04 -1.742e-05 5.233e-03 7.992e-02
Summary for variable: PC16
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.3672920 -0.0027239 0.0000621 0.0000509 0.0029866 0.2636760
Summary for variable: PC17
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.3897230 -0.0021331 -0.0002144 0.0000662 0.0017630 0.2317530
Summary for variable: PC18
Min. 1st Qu. Median Mean 3rd Qu. Max.
-8.897e-02 -5.178e-03 2.432e-04 -7.027e-05 5.797e-03 6.453e-02
Summary for variable: PC19
Min. 1st Qu. Median Mean 3rd Qu. Max.
-1.319e-01 -4.848e-03 2.740e-04 -2.388e-05 5.200e-03 1.277e-01
Summary for variable: PC20
Min. 1st Qu. Median Mean 3rd Qu. Max.
-3.042e-01 -3.843e-03 -3.923e-05 -2.761e-05 3.650e-03 9.395e-02
Summary for variable: screentime_wkdy_1_num
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.250 1.000 1.088 2.000 4.000
Summary for variable: screentime_wknd_7_num
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.500 1.000 1.602 2.000 4.000
Summary for variable: screentime
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.500 1.000 1.235 1.714 4.000
Summary for variable: readtime
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0000 0.4286 0.6038 0.8571 7.1429
# Start the exploratory analysis from a copy of data_subset; the filtering steps
# further down reassign only data_filtered.
data_filtered <- data_subset
# Summary statistics for sports_activity_ss_read_hours_p
# (the printed summary shows min, quartiles, median, mean and max)
cat("Summary statistics for sports_activity_ss_read_hours_p: \n")
Summary statistics for sports_activity_ss_read_hours_p:
summary(data_filtered$sports_activity_ss_read_hours_p)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 3.000 4.227 6.000 50.000
# Summary statistics for screentime variables
# (for these columns summary() prints one count per answer category; note that the
# second category is named "0.25" for weekdays but "< 30 minutes" for weekends)
cat("Summary statistics for screentime_wkdy_1: \n")
Summary statistics for screentime_wkdy_1:
summary(data_filtered$screentime_wkdy_1)
None 0.25 30 minutes 1 hour 2 hours 3 hours 4+ hours
1050 1029 1903 2092 1104 463 486
cat("Summary statistics for screentime_wknd_7: \n")
Summary statistics for screentime_wknd_7:
summary(data_filtered$screentime_wknd_7)
None < 30 minutes 30 minutes 1 hour 2 hours 3 hours 4+ hours
529 637 1169 2231 1690 818 1053
## Visualize the distribution of sports_activity_ss_read_hours_p with log scale on y-axis
# Divide the reported hours by 7 once to get hours per day, then count how many
# rows of `data` fall into each bucket (missing values are ignored).
read_hours_per_day <- data$sports_activity_ss_read_hours_p / 7
zero_to_eight <- sum(read_hours_per_day <= 8, na.rm = TRUE)
four_to_eight <- sum(read_hours_per_day > 4 & read_hours_per_day <= 8, na.rm = TRUE)
eight_to_fourteen <- sum(read_hours_per_day > 8 & read_hours_per_day <= 14, na.rm = TRUE)
more_than_fourteen <- sum(read_hours_per_day > 14, na.rm = TRUE)
# Report the bucket counts. The buckets overlap: zero_to_eight counts every value
# <= 8 hours per day, so it already contains the four_to_eight cases.
cat("Number of cases with 0-8 hours per day:", zero_to_eight, "\n")
Number of cases with 0-8 hours per day: 10993
cat("Number of cases with 4-8 hours per day:", four_to_eight, "\n")
Number of cases with 4-8 hours per day: 73
cat("Number of cases with 8-14 hours per day:", eight_to_fourteen, "\n")
Number of cases with 8-14 hours per day: 21
cat("Number of cases with more than 14 hours per day:", more_than_fourteen, "\n")
Number of cases with more than 14 hours per day: 33
# Histogram of reading hours per day on a log-scaled count axis, with dashed
# reference lines at 8 and 14 hours and the bucket counts annotated at y = 1200
bucket_labels <- c(
  paste("0-8 hours:", zero_to_eight),
  paste("8-14 hours:", eight_to_fourteen),
  paste(">14 hours:", more_than_fourteen)
)
ggplot(data, aes(x = sports_activity_ss_read_hours_p / 7)) +
  geom_histogram(binwidth = 0.5, fill = "lightblue", color = "black") +
  geom_vline(xintercept = c(8, 14), color = "black", linetype = "dashed", size = 1) +
  annotate("text", x = c(4, 11, 18), y = 1200, label = bucket_labels, color = "darkgray") +
  scale_y_log10() +
  labs(
    title = "Distribution of Reading Hours per Day (before any filtering, including QC and NAs)",
    x = "Reading Hours per Day",
    y = "Count"
  ) +
  theme_minimal()
# Give both screentime factors the same level name: the weekday level "0.25" and
# the weekend level "< 30 minutes" are both renamed to "15 minutes"
weekday_levels <- levels(data_filtered$screentime_wkdy_1)
weekday_levels[weekday_levels == "0.25"] <- "15 minutes"
levels(data_filtered$screentime_wkdy_1) <- weekday_levels
weekend_levels <- levels(data_filtered$screentime_wknd_7)
weekend_levels[weekend_levels == "< 30 minutes"] <- "15 minutes"
levels(data_filtered$screentime_wknd_7) <- weekend_levels
# Stack the weekday and weekend answers into one column, keeping the source
# column name in day_type
data_long <- data_filtered %>%
  tidyr::pivot_longer(
    cols = c(screentime_wkdy_1, screentime_wknd_7),
    names_to = "day_type",
    values_to = "screentime_hours"
  )
# Side-by-side bars per answer category, one colour per day type
ggplot(data_long, aes(x = factor(screentime_hours), fill = day_type)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(
    values = c("lightgreen", "lightcoral"),
    labels = c("Weekday", "Weekend")
  ) +
  labs(
    title = "Distribution of Screentime on Weekdays and Weekends",
    x = "Screentime (hours)",
    y = "Count"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Keep only rows with readtime of at most 8 hours per day
# (dplyr::filter also drops rows where readtime is NA)
data_filtered <- data_filtered %>%
  dplyr::filter(readtime <= 8)
cat("Number of lines after filtering by readtime: ", nrow(data_filtered), "\n")
Number of lines after filtering by readtime: 8127
# Extract fit statistics from a fitted model.
#   model: a fitted model whose summary() provides `r.squared` and a
#          `coefficients` matrix (e.g. the result of lm()).
# Returns a list with two elements:
#   r_squared - the model's R-squared
#   p_value   - the entry in row 2, column 4 of the coefficient table
compute_regression_stats <- function(model) {
  model_summary <- summary(model)
  list(
    r_squared = model_summary$r.squared,
    p_value = model_summary$coefficients[2, 4]
  )
}
# Regress `cbcl_scr_dsm5_adhd_t` on `readtime` and prepare the plot annotation
model <- lm(cbcl_scr_dsm5_adhd_t ~ readtime, data = data_filtered)
res <- compute_regression_stats(model)
r_squared_text <- round(res$r_squared, 2)
p_value_text <- format.pval(res$p_value, digits = 2)
annotation <- paste("R^2: ", r_squared_text, "\np-value: ", p_value_text)
cat("Coefficients:\n")
Coefficients:
# Coefficient table of the fit; compute_regression_stats() takes its p-value from
# the second row, fourth column of this table.
print(summary(model)$coefficients)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 53.3916355 0.08074427 661.243621 0.000000e+00
readtime -0.4620706 0.08631384 -5.353377 8.868582e-08
# Jittered scatter of ADHD scores against reading hours with a linear fit line.
# NOTE(review): the annotation text comes from the `readtime` model, while the
# x-axis uses `sports_activity_ss_read_hours_p`; the R^2 and p-value shown match
# the drawn line only if readtime is a linear rescaling of that variable - confirm.
ggplot(data_filtered, aes(x = sports_activity_ss_read_hours_p, y = cbcl_scr_dsm5_adhd_t)) +
  geom_point(
    position = position_jitter(width = 1, height = 1),
    color = "blue",
    alpha = 0.5
  ) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  annotate("text", x = 35, y = 56, label = annotation, color = "black") +
  labs(
    title = "Scatter plot of ADHD Scores vs Reading Hours (Filtered)",
    x = "Reading Hours per Week",
    y = "ADHD Scores"
  ) +
  theme_minimal()
# Regress `cbcl_scr_dsm5_adhd_t` on `screentime` and prepare the plot annotation
model <- lm(cbcl_scr_dsm5_adhd_t ~ screentime, data = data_filtered)
res <- compute_regression_stats(model)
r_squared_text <- round(res$r_squared, 2)
p_value_text <- format.pval(res$p_value, digits = 2)
annotation <- paste("R^2: ", r_squared_text, "\np-value: ", p_value_text)
cat("Coefficients:\n")
Coefficients:
# Coefficient table of the fit; compute_regression_stats() takes its p-value from
# the second row, fourth column of this table.
print(summary(model)$coefficients)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 52.4515320 0.09671206 542.347380 0.000000e+00
screentime 0.5354055 0.06044013 8.858444 9.841849e-19
# Jittered scatter of ADHD scores against screentime with a linear fit line and
# the R^2 / p-value annotation prepared above.
# `screentime` is on a per-day scale: its summary spans 0 to 4, the same range as
# the "None" ... "4+ hours" answer categories, so the x-axis is labelled per day
# (it was previously mislabelled as hours per week).
ggplot(data_filtered, aes(x = screentime, y = cbcl_scr_dsm5_adhd_t)) +
  geom_point(
    color = "blue",
    alpha = 0.5,
    # small horizontal jitter so that identical screentime values do not fully overlap
    position = position_jitter(width = 0.08, height = 1)
  ) +
  labs(
    title = "Scatter plot of ADHD Scores vs Screentime Hours (Filtered)",
    x = "Screentime Hours per Day",
    y = "ADHD Scores"
  ) +
  theme_minimal() +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  annotate("text", x = 3, y = 57, label = annotation, color = "black")
# Summary statistics for NIH Toolbox Scores
# (three uncorrected composite columns; the NA's column in each printed summary is
# the number of rows with a missing score)
cat("Summary statistics for nihtbx_cryst_uncorrected: \n")
Summary statistics for nihtbx_cryst_uncorrected:
summary(data_filtered$nihtbx_cryst_uncorrected)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
59.00 83.00 87.00 86.99 91.00 115.00 115
cat("Summary statistics for nihtbx_fluidcomp_uncorrected: \n")
Summary statistics for nihtbx_fluidcomp_uncorrected:
summary(data_filtered$nihtbx_fluidcomp_uncorrected)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
44.00 86.00 93.00 92.35 99.00 131.00 150
cat("Summary statistics for nihtbx_totalcomp_uncorrected: \n")
Summary statistics for nihtbx_totalcomp_uncorrected:
summary(data_filtered$nihtbx_totalcomp_uncorrected)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
46.00 82.00 88.00 87.06 93.00 117.00 154
# Tag every row with its reading-time group: below 4 hours, or 4 hours and more
data_filtered[["readtime_category"]] <- ifelse(data_filtered$readtime < 4, "<4 hours", ">=4 hours")

## Fit the six regressions (3 cognition scores x 2 reading-time groups) and
## collect R-squared and p-value for each
# The two reading-time subsets are shared by all three scores, so filter once
below_four_hours <- dplyr::filter(data_filtered, readtime < 4)
four_hours_or_more <- dplyr::filter(data_filtered, readtime >= 4)

# Regressions 1-2: nihtbx_cryst_uncorrected vs readtime
model1 <- lm(nihtbx_cryst_uncorrected ~ readtime, data = below_four_hours)
model2 <- lm(nihtbx_cryst_uncorrected ~ readtime, data = four_hours_or_more)
# Regressions 3-4: nihtbx_fluidcomp_uncorrected vs readtime
model3 <- lm(nihtbx_fluidcomp_uncorrected ~ readtime, data = below_four_hours)
model4 <- lm(nihtbx_fluidcomp_uncorrected ~ readtime, data = four_hours_or_more)
# Regressions 5-6: nihtbx_totalcomp_uncorrected vs readtime
model5 <- lm(nihtbx_totalcomp_uncorrected ~ readtime, data = below_four_hours)
model6 <- lm(nihtbx_totalcomp_uncorrected ~ readtime, data = four_hours_or_more)

# Results keyed by a human-readable description of each regression
regression_results <- list(
  "Crystallized Cognition vs Reading Time (<4 hours)" = compute_regression_stats(model1),
  "Crystallized Cognition vs Reading Time (>=4 hours)" = compute_regression_stats(model2),
  "Fluid Cognition vs Reading Time (<4 hours)" = compute_regression_stats(model3),
  "Fluid Cognition vs Reading Time (>=4 hours)" = compute_regression_stats(model4),
  "Total Cognition vs Reading Time (<4 hours)" = compute_regression_stats(model5),
  "Total Cognition vs Reading Time (>=4 hours)" = compute_regression_stats(model6)
)
# Print R-squared and p-value for every stored regression, in insertion order
for (label in names(regression_results)) {
  stats <- regression_results[[label]]
  cat(label, "\n")
  cat("R-squared: ", stats$r_squared, "\n")
  cat("p-value: ", stats$p_value, "\n\n")
}
Crystallized Cognition vs Reading Time (<4 hours)
R-squared: 0.1256564
p-value: 2.589591e-234
Crystallized Cognition vs Reading Time (>=4 hours)
R-squared: 0.02890613
p-value: 0.2190304
Fluid Cognition vs Reading Time (<4 hours)
R-squared: 0.03680869
p-value: 1.424086e-66
Fluid Cognition vs Reading Time (>=4 hours)
R-squared: 6.070349e-05
p-value: 0.9558442
Total Cognition vs Reading Time (<4 hours)
R-squared: 0.09007422
p-value: 1.511456e-164
Total Cognition vs Reading Time (>=4 hours)
R-squared: 0.00211991
p-value: 0.7433841
## Combine the three scatter plots into one plot with facets
# Reshape to long format: the three score columns are stacked into `score`, with
# the source column name kept in `variable`
data_filtered_long <- tidyr::pivot_longer(
  data_filtered,
  cols = c(nihtbx_cryst_uncorrected, nihtbx_fluidcomp_uncorrected, nihtbx_totalcomp_uncorrected),
  names_to = "variable",
  values_to = "score"
)
# One annotation string per fitted regression, named like the entries of
# regression_results
stat_labels <- vapply(
  regression_results,
  function(stats) {
    paste("R^2: ", round(stats$r_squared, 3), "\np-value: ", format.pval(stats$p_value, digits = 2))
  },
  character(1)
)
# Map each score column to the word used in the regression_results names
score_names <- c(
  nihtbx_cryst_uncorrected = "Crystallized",
  nihtbx_fluidcomp_uncorrected = "Fluid",
  nihtbx_totalcomp_uncorrected = "Total"
)
# Attach the matching annotation to every row; rows whose score/category pair has
# no stored regression get NA
data_filtered_long <- dplyr::mutate(
  data_filtered_long,
  regression_label = unname(stat_labels[
    paste0(score_names[variable], " Cognition vs Reading Time (", readtime_category, ")")
  ])
)
# Faceted scatter plot of the three cognition scores against reading time, with a
# separate linear fit (and confidence band) for each reading-time group.

# One row per (score, reading-time group) so that each regression annotation is
# drawn exactly once. Passing the full long data set to geom_text() would draw the
# same label once per participant row, stacked on top of itself.
regression_labels <- data_filtered_long %>%
  dplyr::filter(readtime_category %in% c("<4 hours", ">=4 hours")) %>%
  dplyr::distinct(variable, readtime_category, regression_label) %>%
  # horizontal position of the label: x = 2 for the "<4 hours" group, x = 6 otherwise
  dplyr::mutate(label_x = ifelse(readtime_category == "<4 hours", 2, 6))

combined_plot <- ggplot(data_filtered_long, aes(x = readtime, y = score)) +
  # Points with fill aesthetic
  geom_point(
    aes(fill = readtime_category),
    alpha = 0.2,
    position = position_jitter(width = 0.05, height = 1),
    shape = 21, size = 2, stroke = 0
  ) +
  # Fit lines with color aesthetic
  geom_smooth(
    aes(color = readtime_category),
    method = "lm", se = TRUE, size = 2, linetype = "solid"
  ) +
  # Separate scales for points (fill) and lines (color)
  scale_fill_manual(
    values = c("<4 hours" = "steelblue", ">=4 hours" = "darkseagreen") # Points' fill colors
  ) +
  scale_color_manual(
    values = c("<4 hours" = "darkblue", ">=4 hours" = "darkgreen") # Lines' colors
  ) +
  # Labels
  labs(
    # title = "Scatter plots of NIH Toolbox un-corrected Cognition Scores vs Reading Hours",
    x = "Reading Hours per Day",
    y = "Raw Score"
  ) +
  # Facets: one panel per score, with readable panel titles
  facet_wrap(
    ~ variable,
    scales = "free_x", ncol = 3,
    labeller = as_labeller(c(
      nihtbx_cryst_uncorrected = "Crystallized",
      nihtbx_fluidcomp_uncorrected = "Fluid",
      nihtbx_totalcomp_uncorrected = "Total"
    ))
  ) +
  # Y-axis limits
  # NOTE(review): ylim() sets scale limits, so scores outside 45-130 are removed
  # before the fit lines are computed; the reported lm() statistics use all rows.
  ylim(45, 130) +
  # Theme adjustments
  theme_minimal() +
  theme(
    legend.position = "none",
    aspect.ratio = 0.5,
    # title centring belongs in theme(); it was previously passed to labs()
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    axis.text = element_text(size = 14),
    strip.text = element_text(size = 18)
  ) +
  # Add regression results as text annotations (one label per panel and group)
  geom_text(
    data = regression_labels,
    aes(x = label_x, y = 50, label = regression_label),
    color = "black", size = 3, parse = FALSE
  )
# Render the figure in the notebook, then write it to the figures directory
print(combined_plot)
output_file <- file.path(figures_dir, "combined_uncorrected_cognition_scores_vs_reading_hours2.png")
ggsave(
  filename = output_file,
  plot = combined_plot,
  width = 20,
  height = 6,
  bg = "white"
)
cat("Combined plot saved to:", output_file, "\n")
Combined plot saved to: ../../figures/combined_uncorrected_cognition_scores_vs_reading_hours2.png
# Scatter plot: `nihtbx_totalcomp_uncorrected` (y) vs `cbcl_scr_dsm5_adhd_t` (x),
# with a linear fit and its confidence band
ggplot(data_filtered, aes(x = cbcl_scr_dsm5_adhd_t, y = nihtbx_totalcomp_uncorrected)) +
  geom_point(
    position = position_jitter(width = 0.05, height = 0.4),
    alpha = 0.5
  ) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(
    title = "Scatter plot of Total Cognition Scores vs ADHD t-scores",
    x = "ADHD t-scores",
    y = "Total Cognition Scores"
  ) +
  theme_minimal()
Here we export the design matrix in a specific format, with the columns
`src_subject_id`, `eventname`, `rel_family_id`, and `age`, followed by the
other predictor variables.
We use the `data_subset` data frame for this purpose. We
also need to dummy-encode the categorical variables, check for rank
deficiency, remove the redundant columns, add the intercept column, and
save the matrix to a tab-separated file.
#install.packages("psych")
#install.packages("ordinal")
#install.packages("pracma")
# Source the script that is expected to define makeDesign()
source("../../code/matlab/cmig_tools/cmig_tools_utils/r/makeDesign.R")

# Predictors of interest and the file the design matrix is written to
vars_of_interest <- c("readtime", "screentime")
outfile <- "../../data/derived/design_matrix_readtime+screentime.txt"
time <- "baseline_year_1_arm_1"
# Continuous covariates: age, the predictors of interest and PC1..PC10
contvar <- c("interview_age", vars_of_interest, paste0("PC", 1:10))
# Categorical covariates
catvar <- c(
  "sex", "abcd_site", "married.bl", "household.income.bl",
  "high.educ.bl", "hisp", "mri_info_device.serial.number"
)
demean <- FALSE

# check that all variables are in data_subset; stop on the first missing one
missing_vars <- setdiff(c(contvar, catvar), colnames(data_subset))
if (length(missing_vars) > 0) {
  stop(paste("Variable", missing_vars[1], "not found in data_subset"))
}

# Call the makeDesign function
design_matrix <- makeDesign(
  nda = data_subset,
  outfile = outfile,
  time = time,
  contvar = contvar,
  catvar = catvar,
  demean = demean
)
design is column rank deficient so dropping 5 coef